This is a case study for the Google Data Analyst Certificate:
Bellabeat is a high-tech company that manufactures health-focused smart products. It is a successful small company, but it has the potential to become a larger player in the global smart device market. Since it was founded in 2013, Bellabeat has grown rapidly and quickly positioned itself as a tech-driven wellness company for women.
Our task is to focus on a Bellabeat product and analyze smart device usage data in order to gain insight into how people are already using their smart devices. Then, using this information, we need to recommend how these trends can inform Bellabeat's marketing strategy.
library(plotly)
library(tidyverse) #wrangle data
library(dplyr) #clean data
library(lubridate) #wrangle date attributes
library(skimr) #get summary data
library(ggplot2) #visualize data
library(cowplot) #grid the plot
library(readr) #save csv
library(tidyr) #for organizing tabular data
library(janitor) #for data examination and cleaning
# Load the four Fitbit exports used in this analysis from the working directory.
daily_activity <- read.csv(file = "dailyActivity_merged.csv")
sleep_day      <- read.csv(file = "sleepDay_merged.csv")
weight         <- read.csv(file = "weightLogInfo_merged.csv")
hourly_steps   <- read.csv(file = "hourlySteps_merged.csv")
Examining the first few rows of every data set
# Preview the first six rows of each table to check column names and formats.
head(daily_activity, n = 6L)
head(sleep_day, n = 6L)
head(weight, n = 6L)
head(hourly_steps, n = 6L)
# Count missing cells in each table: is.na() flags every NA cell and sum()
# totals the flags. Only `weight` reports missing values (65 cells).
sum(is.na(daily_activity))
[1] 0
sum(is.na(sleep_day))
[1] 0
sum(is.na(weight))
[1] 65
sum(is.na(hourly_steps))
[1] 0
sleep_day has 3 duplicate rows.
# Check for duplicates: duplicated() flags every row that exactly repeats an
# earlier row, and sum() counts the flags. Only `sleep_day` has repeats (3).
sum(duplicated(daily_activity))
[1] 0
sum(duplicated(sleep_day))
[1] 3
sum(duplicated(weight))
[1] 0
sum(duplicated(hourly_steps))
[1] 0
Number of distinct users per table: daily_activity has 33 users (3 more than 30), sleep_day has 24 (6 fewer than 30), and weight has only 8 (22 fewer than 30).
# Count the distinct user Ids in each table. The tables do not cover the same
# users: 33 in daily_activity and hourly_steps, 24 in sleep_day, 8 in weight.
n_distinct(daily_activity$Id)
[1] 33
n_distinct(sleep_day$Id)
[1] 24
n_distinct(weight$Id)
[1] 8
n_distinct(hourly_steps$Id)
[1] 33
Remove duplicates from sleep_day
# Keep only the first occurrence of each fully identical sleep record.
sleep_day <- unique(sleep_day)
# Confirm that no duplicated rows remain.
sum(duplicated(sleep_day))
[1] 0
Convert Activity Date into date format and add a column for day of the week
Add column for hours in hourly_steps
# Derive the hour of day ("00".."23") from the ActivityHour timestamp.
# The previous format string '%h' is the abbreviated-month specifier, so it
# never matched and every value parsed to NA.
# NOTE(review): assumes ActivityHour looks like "4/12/2016 5:00:00 PM"
# (month/day/year, 12-hour clock) — confirm against head(hourly_steps).
# `%p` (AM/PM) parsing is locale-dependent; an English/C locale is assumed.
hourly_steps$hour <- format(
  as.POSIXct(
    hourly_steps$ActivityHour,
    format = "%m/%d/%Y %I:%M:%S %p",
    tz = "UTC"
  ),
  "%H"
)
We will classify the average user’s sleep into three patterns
# Average minutes asleep per user, bucketed into three sleep patterns:
#   < 300 min        -> "Unhealthy Sleep"
#   300 to 420 min   -> "Normal Sleep"
#   > 420 min        -> "Healthy Sleep"
# The lower bound of "Normal Sleep" is 300 (it was 320), so averages between
# 300 and 320 minutes no longer fall through every branch and become NA.
sleep_day_new <- sleep_day %>%
  group_by(Id) %>%
  summarise(avg_time_sleep = mean(TotalMinutesAsleep)) %>%
  mutate(Categores = case_when(
    avg_time_sleep < 300 ~ "Unhealthy Sleep",
    avg_time_sleep >= 300 & avg_time_sleep <= 420 ~ "Normal Sleep",
    avg_time_sleep > 420 ~ "Healthy Sleep"
  ))
We will separate users into fitness groups based on their walking lifestyle: “Sedentary”, “Needs Improvement”, “Active”, and “Highly Active”.
# Mean daily step count per user, labelled with an activity level.
# case_when() takes the first matching branch, so each upper bound below
# implicitly excludes the ranges already handled above it.
Steps_categores <- daily_activity %>%
  group_by(Id) %>%
  summarise(avg_steps = mean(TotalSteps)) %>%
  mutate(
    level_steps = case_when(
      avg_steps < 5000 ~ "Sedentary",
      avg_steps < 10000 ~ "Needsimprovment",
      avg_steps < 12500 ~ "Active",
      avg_steps >= 12500 ~ "Highly active"
    )
  )
Merge the data tables
# Join activity, sleep and weight on user Id *and* calendar day.
# Joining on Id alone pairs every activity row of a user with every sleep row
# and every weight row of that same user (a many-to-many fan-out), which
# multiplies observations and distorts every downstream sum, mean and plot.
# `merge_date` is a helper key holding the calendar day of each record;
# as.Date() ignores any time-of-day text that follows the m/d/Y part.
# NOTE(review): assumes sleep_day has a `SleepDay` column and weight has a
# `Date` column, both starting with an m/d/Y date — confirm with
# head(sleep_day) and head(weight).
merged1 <- merge(
  transform(daily_activity, merge_date = as.Date(ActivityDate, format = "%m/%d/%Y")),
  transform(sleep_day, merge_date = as.Date(SleepDay, format = "%m/%d/%Y")),
  by = c("Id", "merge_date"),
  all = TRUE
)
data_marged <- merge(
  merged1,
  transform(weight, merge_date = as.Date(Date, format = "%m/%d/%Y")),
  by = c("Id", "merge_date"),
  all = TRUE
)
Convert Activity Date into date format and add a column for day of the week
# Add the day of the week for each activity date, stored as a factor whose
# levels run Sunday..Saturday so charts show the days in calendar order.
data_marged$weekday <- factor(
  weekdays(as.Date(data_marged$ActivityDate, format = "%m/%d/%Y")),
  levels = c(
    "Sunday", "Monday", "Tuesday", "Wednesday",
    "Thursday", "Friday", "Saturday"
  )
)
Summary statistics (mean, median, min, max) for data_marged
# Summarise the key activity, sleep and weight measures of the merged table.
summary(
  dplyr::select(
    data_marged,
    TotalDistance, TotalMinutesAsleep, TotalSteps, TotalTimeInBed,
    Calories, weekday, WeightPounds, BMI
  )
)
TotalDistance TotalMinutesAsleep TotalSteps TotalTimeInBed Calories weekday
Min. : 0.000 Min. : 58.0 Min. : 0 Min. : 61.0 Min. : 0 Sunday :5610
1st Qu.: 3.910 1st Qu.:400.0 1st Qu.: 5832 1st Qu.:421.0 1st Qu.:1850 Monday :5609
Median : 6.820 Median :442.0 Median :10199 Median :457.0 Median :2046 Tuesday :7004
Mean : 6.415 Mean :433.8 Mean : 9373 Mean :458.2 Mean :2103 Wednesday:6988
3rd Qu.: 8.350 3rd Qu.:477.0 3rd Qu.:12109 3rd Qu.:510.0 3rd Qu.:2182 Thursday :6930
Max. :28.030 Max. :796.0 Max. :36019 Max. :961.0 Max. :4900 Friday :5632
NA's :971 NA's :971 Saturday :5616
WeightPounds BMI
Min. :116.0 Min. :21.45
1st Qu.:134.9 1st Qu.:23.89
Median :135.6 Median :24.00
Mean :139.6 Mean :24.42
3rd Qu.:136.7 3rd Qu.:24.21
Max. :294.3 Max. :47.54
NA's :8881 NA's :8881
Let’s look at how active users are per hour in total steps. From 5 p.m. to 7 p.m. users take the most steps
# Stacked bars of step counts by hour; each bar sums StepTotal over all rows
# sharing the same `hour` value.
# The axis label must be passed as lowercase `y`; an uppercase `Y` is not the
# y aesthetic, so the "Steps" label was never applied.
ggplot(data = hourly_steps, aes(x = hour, y = StepTotal, fill = hour)) +
  geom_bar(stat = "identity") +
  labs(title = "Steps by Hour", x = "Hours", y = "Steps")
Let’s look at how users are distributed across the step categories: 54.5% of users fall into the “Needs Improvement” category.
# Pie chart of the share of users in each step category.
# No `values` are supplied on purpose: plotly then counts how often each label
# occurs, i.e. one slice unit per user. The previous `value = ~avg_steps`
# argument is not a pie attribute (the real one is `values`); it was ignored
# with a warning, so removing it leaves the chart unchanged.
plot_ly(
  Steps_categores,
  labels = ~level_steps,
  type = 'pie',
  textposition = 'outside',
  textinfo = 'label+percent'
) %>%
  layout(title = 'Users categories by steps')
# Bar chart of total steps per weekday; geom_col() draws bar heights from the
# y values directly, summing rows that share a weekday.
ggplot(data_marged, aes(x = weekday, y = TotalSteps)) +
  geom_col(fill = 'steelblue') +
  labs(title = 'Weeklyday Steps', x = "Weekday", y = "Steps")
Let’s look at how users are distributed across the categories of average sleep duration.
# Order the sleep categories from worst to best.
sleep_day_new$Categores <- factor(
  sleep_day_new$Categores,
  levels = c("Unhealthy Sleep", "Normal Sleep", "Healthy Sleep")
)
# Pie chart of the share of users in each sleep category.
# No `values` are supplied on purpose: plotly then counts how often each label
# occurs, i.e. one slice unit per user. The previous `value = ~avg_time_sleep`
# argument is not a pie attribute (the real one is `values`); it was ignored
# with a warning, so removing it leaves the chart unchanged.
plot_ly(
  sleep_day_new,
  labels = ~Categores,
  type = "pie",
  textposition = 'outside',
  textinfo = 'label+percent'
) %>%
  layout(title = 'usere categories by hourly sleep')
The more active you are, the more steps you take and the more calories you burn. This is an obvious fact, but we can still look at the data for anything interesting. Here we see that users with similar weights can differ widely: at around 60 kg, some burn more than 2,500 calories with nearly 20,000 steps, while others burn only a little more than 1,500 calories with about 10,000 steps. There are also users weighing up to 120 kg who burn over 2,000 calories with far fewer steps, even fewer than 5,000.
# Scatter plot of calories burned against steps taken, with points coloured by
# body weight (green = lighter, red = heavier) and a linear trend line.
ggplot(data_marged, aes(x = TotalSteps, y = Calories, color = WeightKg)) +
  geom_point() +
  stat_smooth(method = lm) +
  scale_color_gradient(low = 'green', high = 'red') +
  labs(title = "Steps vs Calories", x = "Steps", y = "Calories")
`geom_smooth()` using formula 'y ~ x'
As for the rumor that a person who sleeps a lot burns fewer calories, we see no significant correlation between sleeping more and burning fewer calories.
# Scatter plot of minutes asleep against calories burned, with points coloured
# by minutes asleep and a linear trend line.
# The axis label must be passed as lowercase `x`; an uppercase `X` is not the
# x aesthetic, so the "Calories" label was never applied.
ggplot(
  data = data_marged,
  aes(x = Calories, y = TotalMinutesAsleep, color = TotalMinutesAsleep)
) +
  geom_point() +
  stat_smooth(method = lm) +
  scale_color_gradient(low = 'green', high = 'red') +
  labs(title = "Sleep Vs Calories", x = "Calories", y = "Sleep Minutes")
`geom_smooth()` using formula 'y ~ x'
Warning: Removed 971 rows containing non-finite values (stat_smooth).
Warning: Removed 971 rows containing missing values (geom_point).